import pandas as pd
import matplotlib.pyplot as plt
# Importing libraries for data preprocessing and clustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
# Read the dataset from a CSV file into a pandas DataFrame
df_pre = pd.read_csv("dataset.csv")  # 'Unnamed: 0' column suggests the CSV was saved with its index included
df_pre
| Unnamed: 0 | track_id | artists | album_name | track_name | popularity | duration_ms | explicit | danceability | energy | ... | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 5SuOikwiRyPMVoIQDJUgSV | Gen Hoshino | Comedy | Comedy | 73 | 230666 | False | 0.676 | 0.4610 | ... | -6.746 | 0 | 0.1430 | 0.0322 | 0.000001 | 0.3580 | 0.7150 | 87.917 | 4 | acoustic |
| 1 | 1 | 4qPNDBW1i3p13qLCt0Ki3A | Ben Woodward | Ghost (Acoustic) | Ghost - Acoustic | 55 | 149610 | False | 0.420 | 0.1660 | ... | -17.235 | 1 | 0.0763 | 0.9240 | 0.000006 | 0.1010 | 0.2670 | 77.489 | 4 | acoustic |
| 2 | 2 | 1iJBSr7s7jYXzM8EGcbK5b | Ingrid Michaelson;ZAYN | To Begin Again | To Begin Again | 57 | 210826 | False | 0.438 | 0.3590 | ... | -9.734 | 1 | 0.0557 | 0.2100 | 0.000000 | 0.1170 | 0.1200 | 76.332 | 4 | acoustic |
| 3 | 3 | 6lfxq3CG4xtTiEg7opyCyx | Kina Grannis | Crazy Rich Asians (Original Motion Picture Sou... | Can't Help Falling In Love | 71 | 201933 | False | 0.266 | 0.0596 | ... | -18.515 | 1 | 0.0363 | 0.9050 | 0.000071 | 0.1320 | 0.1430 | 181.740 | 3 | acoustic |
| 4 | 4 | 5vjLSffimiIP26QG5WcN2K | Chord Overstreet | Hold On | Hold On | 82 | 198853 | False | 0.618 | 0.4430 | ... | -9.681 | 1 | 0.0526 | 0.4690 | 0.000000 | 0.0829 | 0.1670 | 119.949 | 4 | acoustic |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113995 | 113995 | 2C3TZjDRiAzdyViavDJ217 | Rainy Lullaby | #mindfulness - Soft Rain for Mindful Meditatio... | Sleep My Little Boy | 21 | 384999 | False | 0.172 | 0.2350 | ... | -16.393 | 1 | 0.0422 | 0.6400 | 0.928000 | 0.0863 | 0.0339 | 125.995 | 5 | world-music |
| 113996 | 113996 | 1hIz5L4IB9hN3WRYPOCGPw | Rainy Lullaby | #mindfulness - Soft Rain for Mindful Meditatio... | Water Into Light | 22 | 385000 | False | 0.174 | 0.1170 | ... | -18.318 | 0 | 0.0401 | 0.9940 | 0.976000 | 0.1050 | 0.0350 | 85.239 | 4 | world-music |
| 113997 | 113997 | 6x8ZfSoqDjuNa5SVP5QjvX | Cesária Evora | Best Of | Miss Perfumado | 22 | 271466 | False | 0.629 | 0.3290 | ... | -10.895 | 0 | 0.0420 | 0.8670 | 0.000000 | 0.0839 | 0.7430 | 132.378 | 4 | world-music |
| 113998 | 113998 | 2e6sXL2bYv4bSz6VTdnfLs | Michael W. Smith | Change Your World | Friends | 41 | 283893 | False | 0.587 | 0.5060 | ... | -10.889 | 1 | 0.0297 | 0.3810 | 0.000000 | 0.2700 | 0.4130 | 135.960 | 4 | world-music |
| 113999 | 113999 | 2hETkH7cOfqmz3LqZDHZf5 | Cesária Evora | Miss Perfumado | Barbincor | 22 | 241826 | False | 0.526 | 0.4870 | ... | -10.204 | 0 | 0.0725 | 0.6810 | 0.000000 | 0.0893 | 0.7080 | 79.198 | 4 | world-music |
114000 rows × 21 columns
# Checking the shape of the DataFrame
df_pre.shape  # (rows, columns) of the raw dataset
(114000, 21)
Several genres have been identified for exclusion from the dataset. These genres were chosen somewhat arbitrarily — mostly minor variants of genres with a very similar sound, which were dropped in favour of their broader counterparts.
# List of genres to be dropped from the DataFrame (mostly near-duplicate
# genres of similar-sounding tracks, per the note above).
genres_drop = ['ska', 'trip-hop', 'grindcore', 'death-metal', 'metalcore', 'honky-tonk','detroit-techno', 'black-metal', 'new-age', 'sertanejo',
               'world-music', 'singer-songwriter', 'j-idol', 'j-dance', 'j-rock', 'j-pop','indian', 'bluegrass', 'breakbeat', 'chicago-house',
               'malay', 'cantopop', 'mandopop', 'rockabilly', 'kids', 'children', 'german', 'progressive-house', 'hardstyle', 'minimal-techno',
               'mpb', 'study', 'pop-film', 'pagode', 'turkish', 'tango', 'swedish', 'show-tunes', 'anime', 'power-pop', 'dub', 'idm', 'rock-n-roll',
               'samba', 'sleep']
# Keep only rows whose genre is not in the drop list. `.copy()` makes df an
# independent DataFrame rather than a view of df_pre, so later in-place
# operations (e.g. dropna) do not trip pandas' SettingWithCopyWarning
# (currently hidden by the warnings filter at the top of the notebook).
df = df_pre[~df_pre['track_genre'].isin(genres_drop)].copy()
df
| Unnamed: 0 | track_id | artists | album_name | track_name | popularity | duration_ms | explicit | danceability | energy | ... | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 5SuOikwiRyPMVoIQDJUgSV | Gen Hoshino | Comedy | Comedy | 73 | 230666 | False | 0.676 | 0.4610 | ... | -6.746 | 0 | 0.1430 | 0.032200 | 0.000001 | 0.3580 | 0.7150 | 87.917 | 4 | acoustic |
| 1 | 1 | 4qPNDBW1i3p13qLCt0Ki3A | Ben Woodward | Ghost (Acoustic) | Ghost - Acoustic | 55 | 149610 | False | 0.420 | 0.1660 | ... | -17.235 | 1 | 0.0763 | 0.924000 | 0.000006 | 0.1010 | 0.2670 | 77.489 | 4 | acoustic |
| 2 | 2 | 1iJBSr7s7jYXzM8EGcbK5b | Ingrid Michaelson;ZAYN | To Begin Again | To Begin Again | 57 | 210826 | False | 0.438 | 0.3590 | ... | -9.734 | 1 | 0.0557 | 0.210000 | 0.000000 | 0.1170 | 0.1200 | 76.332 | 4 | acoustic |
| 3 | 3 | 6lfxq3CG4xtTiEg7opyCyx | Kina Grannis | Crazy Rich Asians (Original Motion Picture Sou... | Can't Help Falling In Love | 71 | 201933 | False | 0.266 | 0.0596 | ... | -18.515 | 1 | 0.0363 | 0.905000 | 0.000071 | 0.1320 | 0.1430 | 181.740 | 3 | acoustic |
| 4 | 4 | 5vjLSffimiIP26QG5WcN2K | Chord Overstreet | Hold On | Hold On | 82 | 198853 | False | 0.618 | 0.4430 | ... | -9.681 | 1 | 0.0526 | 0.469000 | 0.000000 | 0.0829 | 0.1670 | 119.949 | 4 | acoustic |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 110995 | 110995 | 7sLknEg8aVr0m5ZuCja7b3 | NG Rezonance;PHD | Syncopy Radio Edits, Vol. 1 | Divergence - Radio Edit | 28 | 185142 | False | 0.148 | 0.9930 | ... | -7.696 | 0 | 0.0922 | 0.009700 | 0.937000 | 0.0376 | 0.0928 | 140.001 | 4 | trance |
| 110996 | 110996 | 6veycwSGozeHSFQ6fbr5dC | NG Rezonance;PHD | Syncopy Radio Edits, Vol. 1 | Forgotten - Radio Edit | 28 | 193714 | False | 0.504 | 0.9850 | ... | -7.305 | 1 | 0.0504 | 0.000810 | 0.922000 | 0.1250 | 0.3830 | 139.978 | 4 | trance |
| 110997 | 110997 | 0MLEzWJQcRkc5IMAqucPbV | NG Rezonance;Begbie | Syncopy Radio Edits, Vol. 1 | Feel The Panic - Radio Edit | 28 | 209600 | False | 0.474 | 0.9950 | ... | -4.265 | 1 | 0.0979 | 0.000166 | 0.369000 | 0.1500 | 0.0634 | 150.002 | 4 | trance |
| 110998 | 110998 | 0cRNPYxzXLNLQd1g4kKYS6 | NG Rezonance | Syncopy Radio Edits, Vol. 1 | Fate - Instrumental Radio Edit | 28 | 134800 | False | 0.416 | 0.9810 | ... | -3.653 | 0 | 0.0943 | 0.000079 | 0.928000 | 0.1870 | 0.0662 | 150.054 | 3 | trance |
| 110999 | 110999 | 2dDE3WCSj2cELFYO1IfECD | NG Rezonance | Syncopy Radio Edits, Vol. 1 | Deception - NG Rezonance 2013 Radio Edit | 28 | 162206 | False | 0.469 | 0.9870 | ... | -5.525 | 0 | 0.0796 | 0.000055 | 0.932000 | 0.3080 | 0.2430 | 145.000 | 4 | trance |
69000 rows × 21 columns
df.shape  # row count after the genre filter
(69000, 21)
# Drop rows with missing values. Reassigning instead of using inplace=True
# avoids mutating what may be a view of df_pre (SettingWithCopyWarning);
# the resulting data is identical.
df = df.dropna()
df
| Unnamed: 0 | track_id | artists | album_name | track_name | popularity | duration_ms | explicit | danceability | energy | ... | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | track_genre | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 5SuOikwiRyPMVoIQDJUgSV | Gen Hoshino | Comedy | Comedy | 73 | 230666 | False | 0.676 | 0.4610 | ... | -6.746 | 0 | 0.1430 | 0.032200 | 0.000001 | 0.3580 | 0.7150 | 87.917 | 4 | acoustic |
| 1 | 1 | 4qPNDBW1i3p13qLCt0Ki3A | Ben Woodward | Ghost (Acoustic) | Ghost - Acoustic | 55 | 149610 | False | 0.420 | 0.1660 | ... | -17.235 | 1 | 0.0763 | 0.924000 | 0.000006 | 0.1010 | 0.2670 | 77.489 | 4 | acoustic |
| 2 | 2 | 1iJBSr7s7jYXzM8EGcbK5b | Ingrid Michaelson;ZAYN | To Begin Again | To Begin Again | 57 | 210826 | False | 0.438 | 0.3590 | ... | -9.734 | 1 | 0.0557 | 0.210000 | 0.000000 | 0.1170 | 0.1200 | 76.332 | 4 | acoustic |
| 3 | 3 | 6lfxq3CG4xtTiEg7opyCyx | Kina Grannis | Crazy Rich Asians (Original Motion Picture Sou... | Can't Help Falling In Love | 71 | 201933 | False | 0.266 | 0.0596 | ... | -18.515 | 1 | 0.0363 | 0.905000 | 0.000071 | 0.1320 | 0.1430 | 181.740 | 3 | acoustic |
| 4 | 4 | 5vjLSffimiIP26QG5WcN2K | Chord Overstreet | Hold On | Hold On | 82 | 198853 | False | 0.618 | 0.4430 | ... | -9.681 | 1 | 0.0526 | 0.469000 | 0.000000 | 0.0829 | 0.1670 | 119.949 | 4 | acoustic |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 110995 | 110995 | 7sLknEg8aVr0m5ZuCja7b3 | NG Rezonance;PHD | Syncopy Radio Edits, Vol. 1 | Divergence - Radio Edit | 28 | 185142 | False | 0.148 | 0.9930 | ... | -7.696 | 0 | 0.0922 | 0.009700 | 0.937000 | 0.0376 | 0.0928 | 140.001 | 4 | trance |
| 110996 | 110996 | 6veycwSGozeHSFQ6fbr5dC | NG Rezonance;PHD | Syncopy Radio Edits, Vol. 1 | Forgotten - Radio Edit | 28 | 193714 | False | 0.504 | 0.9850 | ... | -7.305 | 1 | 0.0504 | 0.000810 | 0.922000 | 0.1250 | 0.3830 | 139.978 | 4 | trance |
| 110997 | 110997 | 0MLEzWJQcRkc5IMAqucPbV | NG Rezonance;Begbie | Syncopy Radio Edits, Vol. 1 | Feel The Panic - Radio Edit | 28 | 209600 | False | 0.474 | 0.9950 | ... | -4.265 | 1 | 0.0979 | 0.000166 | 0.369000 | 0.1500 | 0.0634 | 150.002 | 4 | trance |
| 110998 | 110998 | 0cRNPYxzXLNLQd1g4kKYS6 | NG Rezonance | Syncopy Radio Edits, Vol. 1 | Fate - Instrumental Radio Edit | 28 | 134800 | False | 0.416 | 0.9810 | ... | -3.653 | 0 | 0.0943 | 0.000079 | 0.928000 | 0.1870 | 0.0662 | 150.054 | 3 | trance |
| 110999 | 110999 | 2dDE3WCSj2cELFYO1IfECD | NG Rezonance | Syncopy Radio Edits, Vol. 1 | Deception - NG Rezonance 2013 Radio Edit | 28 | 162206 | False | 0.469 | 0.9870 | ... | -5.525 | 0 | 0.0796 | 0.000055 | 0.932000 | 0.3080 | 0.2430 | 145.000 | 4 | trance |
68999 rows × 21 columns
df.shape  # row count after dropping missing values
(68999, 21)
df.describe()
| Unnamed: 0 | popularity | duration_ms | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | time_signature | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 68999.000000 | 68999.000000 | 6.899900e+04 | 68999.000000 | 68999.000000 | 68999.000000 | 68999.000000 | 68999.000000 | 68999.000000 | 68999.000000 | 68999.000000 | 68999.000000 | 68999.000000 | 68999.000000 | 68999.000000 |
| mean | 51426.826476 | 33.580907 | 2.252585e+05 | 0.576138 | 0.645810 | 5.325584 | -7.996436 | 0.627009 | 0.088281 | 0.302674 | 0.129505 | 0.204778 | 0.477796 | 122.057113 | 3.911129 |
| std | 31499.289759 | 24.772096 | 9.418724e+04 | 0.168848 | 0.247882 | 3.556244 | 4.958435 | 0.483603 | 0.119065 | 0.329293 | 0.283917 | 0.180854 | 0.254506 | 29.876655 | 0.405602 |
| min | 0.000000 | 0.000000 | 8.586000e+03 | 0.000000 | 0.000020 | 0.000000 | -43.957000 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.009250 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 26249.500000 | 4.000000 | 1.755505e+05 | 0.465000 | 0.488000 | 2.000000 | -9.517500 | 0.000000 | 0.036200 | 0.018500 | 0.000000 | 0.096900 | 0.268000 | 98.299500 | 4.000000 |
| 50% | 46499.000000 | 35.000000 | 2.113410e+05 | 0.588000 | 0.694000 | 5.000000 | -6.669000 | 1.000000 | 0.049100 | 0.152000 | 0.000029 | 0.129000 | 0.468000 | 121.820000 | 4.000000 |
| 75% | 79749.500000 | 54.000000 | 2.560000e+05 | 0.703000 | 0.849000 | 8.000000 | -4.839000 | 1.000000 | 0.084500 | 0.558000 | 0.018200 | 0.258000 | 0.682500 | 140.044000 | 4.000000 |
| max | 110999.000000 | 100.000000 | 4.246206e+06 | 0.980000 | 1.000000 | 11.000000 | 4.532000 | 1.000000 | 0.965000 | 0.996000 | 0.999000 | 0.997000 | 0.995000 | 243.372000 | 5.000000 |
Overall, from this dataset, we can expect to see clusters of tracks with similar features, suggesting common musical genres or styles. For example, clustering analysis could be performed to identify groups of tracks with similar acoustic, energetic, or emotional characteristics, helping music streaming platforms recommend tracks to users based on their preferences.
Using seaborn to visualize the distribution of numerical features in the dataset. Each subplot represents the distribution of a specific numerical feature, with a histogram and kernel density estimation (kde).
import seaborn as sns
sns.set_style("darkgrid")
# Identifying numerical columns
numerical_columns = df.select_dtypes(include=["int64", "float64"]).columns
# Two plots per row; ceil-divide so an odd count still gets a slot.
# (The original used len(numerical_columns) rows for a 2-column grid, which
# left the bottom half of the figure blank and doubled its height.)
n_rows = (len(numerical_columns) + 1) // 2
plt.figure(figsize=(14, n_rows * 3))
for idx, feature in enumerate(numerical_columns, 1):
    plt.subplot(n_rows, 2, idx)
    sns.histplot(df[feature], kde=True)
    # Skewness near 0 => symmetric; positive/negative => right/left skew.
    plt.title(f"{feature} | Skewness: {round(df[feature].skew(), 2)}")
# To adjust layout and show plots
plt.tight_layout()
plt.show()
The plots provide an overview of the distribution of each numerical feature, facilitating the identification of potential skewness or anomalies. Skewness measures the asymmetry of the data distribution, with values close to zero indicating symmetry, and positive or negative values indicating right or left skew, respectively.
Features with significant skewness may require further preprocessing or transformation to enhance model performance in predictive tasks. In our case, we did not use time_signature and mode in our model, as is evident from the visualisation.
Again we utilize seaborn, this time to create a pair plot for the DataFrame. Each scatterplot shows the relationship between two features, with one feature plotted against the other.
# Color palette for seaborn plots
sns.set_palette("Pastel1")
# sns.pairplot creates its OWN figure; the original code opened a separate
# empty plt.figure(figsize=(10, 6)) first, so the suptitle and savefig were
# applied to that blank canvas (hence the stray
# "<Figure size 1000x600 with 0 Axes>" output). Use the PairGrid's figure.
grid = sns.pairplot(df)
# Title (y > 1 lifts it above the grid of axes)
grid.fig.suptitle('Pair Plot for DataFrame', y=1.02)
# Save as an image
grid.savefig('pair_plot.png')
# show
plt.show()
<Figure size 1000x600 with 0 Axes>
Scatterplots reveal the patterns, trends, and potential correlations between pairs of features. Diagonal plots show the distribution of individual features, while off-diagonal plots display the relationships between pairs of features.
The code generates a heatmap visualization of the correlation matrix for the numeric columns in the DataFrame. Each cell in the heatmap represents the correlation coefficient between two numeric features.
sns.set(style="whitegrid")
# Restrict to numeric columns before computing correlations
numeric_df = df.select_dtypes(include=['float64', 'int64'])
# Pairwise Pearson correlation matrix of the numeric features
correlation_matrix = numeric_df.corr()
colors = sns.color_palette('viridis')
# Render the correlation matrix as an annotated heatmap
plt.figure(figsize=(12, 10))
heatmap_style = dict(
    annot=True,              # print the coefficient in each cell
    cmap='viridis',
    fmt=".2f",
    linewidths=0.5,
    vmin=-1,
    vmax=1,                  # fix the color scale to the full [-1, 1] range
    cbar_kws={'label': 'Correlation'},
    annot_kws={"size": 10},
    square=True,
)
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title('Correlation Matrix', fontsize=16, fontweight='bold')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability if needed
plt.yticks(rotation=0)
plt.show()
The heatmap provides a visual representation of the strength and direction of the linear relationship between pairs of features. Correlation coefficients close to 1 or -1 indicate strong positive or negative correlations, respectively. Values close to 0 indicate weak or no correlation. Based on the matrix, we decided to drop the following columns: 'Unnamed: 0', 'popularity', 'instrumentalness', 'explicit', 'duration_ms', and 'key'.
df.info()  # column dtypes and non-null counts before dropping columns
<class 'pandas.core.frame.DataFrame'> Index: 68999 entries, 0 to 110999 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 68999 non-null int64 1 track_id 68999 non-null object 2 artists 68999 non-null object 3 album_name 68999 non-null object 4 track_name 68999 non-null object 5 popularity 68999 non-null int64 6 duration_ms 68999 non-null int64 7 explicit 68999 non-null bool 8 danceability 68999 non-null float64 9 energy 68999 non-null float64 10 key 68999 non-null int64 11 loudness 68999 non-null float64 12 mode 68999 non-null int64 13 speechiness 68999 non-null float64 14 acousticness 68999 non-null float64 15 instrumentalness 68999 non-null float64 16 liveness 68999 non-null float64 17 valence 68999 non-null float64 18 tempo 68999 non-null float64 19 time_signature 68999 non-null int64 20 track_genre 68999 non-null object dtypes: bool(1), float64(9), int64(6), object(5) memory usage: 11.1+ MB
Now the dataset contains 68,999 entries and 21 columns.
# Remove the identifier / weakly-informative columns flagged by the
# correlation matrix before clustering.
columns_to_remove = ['Unnamed: 0', 'popularity', 'instrumentalness',
                     'explicit', 'duration_ms', 'key']
df = df.drop(columns=columns_to_remove)
df
df.shape
(68999, 15)
First feature scaling is applied to the selected numerical columns in the DataFrame using the StandardScaler from scikit-learn. It first initializes the scaler object and specifies the columns to be scaled. Then, it transforms the selected columns using the fit_transform method of the scaler object.
# Fit a StandardScaler (zero mean, unit variance) on the selected audio
# features and overwrite them in-place with their standardized values.
scaler = StandardScaler()
number_cols = ['valence', 'acousticness', 'danceability', 'energy', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo']
df[number_cols] = scaler.fit_transform(df[number_cols])
df
| track_id | artists | album_name | track_name | danceability | energy | loudness | mode | speechiness | acousticness | liveness | valence | tempo | time_signature | track_genre | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5SuOikwiRyPMVoIQDJUgSV | Gen Hoshino | Comedy | Comedy | 0.591433 | -0.745560 | 0.252185 | -1.296546 | 0.459572 | -0.821385 | 0.847224 | 0.932025 | -1.142710 | 4 | acoustic |
| 1 | 4qPNDBW1i3p13qLCt0Ki3A | Ben Woodward | Ghost (Acoustic) | Ghost - Acoustic | -0.924730 | -1.935649 | -1.863215 | 0.771280 | -0.100629 | 1.886861 | -0.573825 | -0.828263 | -1.491748 | 4 | acoustic |
| 2 | 1iJBSr7s7jYXzM8EGcbK5b | Ingrid Michaelson;ZAYN | To Begin Again | To Begin Again | -0.818125 | -1.157048 | -0.350428 | 0.771280 | -0.273644 | -0.281437 | -0.485355 | -1.405857 | -1.530474 | 4 | acoustic |
| 3 | 6lfxq3CG4xtTiEg7opyCyx | Kina Grannis | Crazy Rich Asians (Original Motion Picture Sou... | Can't Help Falling In Love | -1.836797 | -2.364888 | -2.121363 | 0.771280 | -0.436581 | 1.829161 | -0.402414 | -1.315485 | 1.997657 | 3 | acoustic |
| 4 | 5vjLSffimiIP26QG5WcN2K | Chord Overstreet | Hold On | Hold On | 0.247927 | -0.818176 | -0.339740 | 0.771280 | -0.299681 | 0.505103 | -0.673906 | -1.221184 | -0.070561 | 4 | acoustic |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 110995 | 7sLknEg8aVr0m5ZuCja7b3 | NG Rezonance;PHD | Syncopy Radio Edits, Vol. 1 | Divergence - Radio Edit | -2.535654 | 1.400634 | 0.060591 | -1.296546 | 0.032912 | -0.889714 | -0.924387 | -1.512732 | 0.600603 | 4 | trance |
| 110996 | 6veycwSGozeHSFQ6fbr5dC | NG Rezonance;PHD | Syncopy Radio Edits, Vol. 1 | Forgotten - Radio Edit | -0.427239 | 1.368360 | 0.139447 | 0.771280 | -0.318158 | -0.916711 | -0.441120 | -0.372474 | 0.599833 | 4 | trance |
| 110997 | 0MLEzWJQcRkc5IMAqucPbV | NG Rezonance;Begbie | Syncopy Radio Edits, Vol. 1 | Feel The Panic - Radio Edit | -0.604915 | 1.408702 | 0.752549 | 0.771280 | 0.080785 | -0.918667 | -0.302886 | -1.628251 | 0.935349 | 4 | trance |
| 110998 | 0cRNPYxzXLNLQd1g4kKYS6 | NG Rezonance | Syncopy Radio Edits, Vol. 1 | Fate - Instrumental Radio Edit | -0.948420 | 1.352224 | 0.875975 | -1.296546 | 0.050550 | -0.918932 | -0.098299 | -1.617249 | 0.937089 | 3 | trance |
| 110999 | 2dDE3WCSj2cELFYO1IfECD | NG Rezonance | Syncopy Radio Edits, Vol. 1 | Deception - NG Rezonance 2013 Radio Edit | -0.634527 | 1.376429 | 0.498434 | -1.296546 | -0.072913 | -0.919005 | 0.570755 | -0.922564 | 0.767926 | 4 | trance |
68999 rows × 15 columns
This function, optimise_k_means, aims to find the optimal number of clusters for K-means clustering by generating an elbow plot. It iterates over a range of cluster numbers from 1 to max_k, fits a K-means model for each cluster number, and calculates the inertia (within-cluster sum of squared distances to the nearest cluster center). The inertia is then plotted against the number of clusters to visualize the elbow point, which represents the optimal number of clusters where the inertia starts to decrease at a slower rate.
def optimise_k_means(data, max_k):
    """Plot a K-means elbow curve for k = 1..max_k (inclusive).

    Fits a K-means model for each candidate cluster count and records its
    inertia (within-cluster sum of squared distances to the nearest cluster
    center); the "elbow" in the resulting curve suggests a good number of
    clusters.
    """
    means = []
    inertias = []
    # range(1, max_k) silently skipped k == max_k; include it so the plot
    # matches the stated "1 to max_k" contract.
    for k in range(1, max_k + 1):
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(data)
        means.append(k)
        inertias.append(kmeans.inertia_)
    # Generate the elbow plot. plt.subplots returns (figure, axes); the
    # original bound the whole tuple to `fig` and never used it.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(means, inertias, 'o-')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia')
    ax.grid(True)
    plt.show()

optimise_k_means(df[number_cols], 10)
From the graph, 7 clusters looks like a good choice: beyond that point the inertia curve flattens out towards an asymptote.
Clustering with K-Means
Here, the simple K-means clustering algorithm was used to divide the dataset into seven clusters based on the numerical audio features of each track.
# Final model: K-means with 7 clusters (chosen from the elbow plot above)
kmeans = KMeans(n_clusters=7)
kmeans.fit(df[number_cols])
KMeans(n_clusters=7)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=7)
# Attach each track's cluster assignment as a new column
df['kmeans_7'] = kmeans.labels_
df
| track_id | artists | album_name | track_name | danceability | energy | loudness | mode | speechiness | acousticness | liveness | valence | tempo | time_signature | track_genre | kmeans_7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5SuOikwiRyPMVoIQDJUgSV | Gen Hoshino | Comedy | Comedy | 0.591433 | -0.745560 | 0.252185 | -1.296546 | 0.459572 | -0.821385 | 0.847224 | 0.932025 | -1.142710 | 4 | acoustic | 1 |
| 1 | 4qPNDBW1i3p13qLCt0Ki3A | Ben Woodward | Ghost (Acoustic) | Ghost - Acoustic | -0.924730 | -1.935649 | -1.863215 | 0.771280 | -0.100629 | 1.886861 | -0.573825 | -0.828263 | -1.491748 | 4 | acoustic | 3 |
| 2 | 1iJBSr7s7jYXzM8EGcbK5b | Ingrid Michaelson;ZAYN | To Begin Again | To Begin Again | -0.818125 | -1.157048 | -0.350428 | 0.771280 | -0.273644 | -0.281437 | -0.485355 | -1.405857 | -1.530474 | 4 | acoustic | 6 |
| 3 | 6lfxq3CG4xtTiEg7opyCyx | Kina Grannis | Crazy Rich Asians (Original Motion Picture Sou... | Can't Help Falling In Love | -1.836797 | -2.364888 | -2.121363 | 0.771280 | -0.436581 | 1.829161 | -0.402414 | -1.315485 | 1.997657 | 3 | acoustic | 3 |
| 4 | 5vjLSffimiIP26QG5WcN2K | Chord Overstreet | Hold On | Hold On | 0.247927 | -0.818176 | -0.339740 | 0.771280 | -0.299681 | 0.505103 | -0.673906 | -1.221184 | -0.070561 | 4 | acoustic | 6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 110995 | 7sLknEg8aVr0m5ZuCja7b3 | NG Rezonance;PHD | Syncopy Radio Edits, Vol. 1 | Divergence - Radio Edit | -2.535654 | 1.400634 | 0.060591 | -1.296546 | 0.032912 | -0.889714 | -0.924387 | -1.512732 | 0.600603 | 4 | trance | 0 |
| 110996 | 6veycwSGozeHSFQ6fbr5dC | NG Rezonance;PHD | Syncopy Radio Edits, Vol. 1 | Forgotten - Radio Edit | -0.427239 | 1.368360 | 0.139447 | 0.771280 | -0.318158 | -0.916711 | -0.441120 | -0.372474 | 0.599833 | 4 | trance | 0 |
| 110997 | 0MLEzWJQcRkc5IMAqucPbV | NG Rezonance;Begbie | Syncopy Radio Edits, Vol. 1 | Feel The Panic - Radio Edit | -0.604915 | 1.408702 | 0.752549 | 0.771280 | 0.080785 | -0.918667 | -0.302886 | -1.628251 | 0.935349 | 4 | trance | 0 |
| 110998 | 0cRNPYxzXLNLQd1g4kKYS6 | NG Rezonance | Syncopy Radio Edits, Vol. 1 | Fate - Instrumental Radio Edit | -0.948420 | 1.352224 | 0.875975 | -1.296546 | 0.050550 | -0.918932 | -0.098299 | -1.617249 | 0.937089 | 3 | trance | 0 |
| 110999 | 2dDE3WCSj2cELFYO1IfECD | NG Rezonance | Syncopy Radio Edits, Vol. 1 | Deception - NG Rezonance 2013 Radio Edit | -0.634527 | 1.376429 | 0.498434 | -1.296546 | -0.072913 | -0.919005 | 0.570755 | -0.922564 | 0.767926 | 4 | trance | 0 |
68999 rows × 16 columns
Visualising the clusters identified by KMeans clustering using t-SNE (t-distributed Stochastic Neighbor Embedding). It is a dimensionality reduction technique that helps in visualizing high-dimensional datasets by mapping them to two or three dimensions while preserving their structural integrity.
# Visualizing the Clusters with t-SNE
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
# Pipeline to standardize the data and then apply t-SNE
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])
# Fitting the pipeline to the data and transforming it to get the t-SNE embedding
genre_embedding = tsne_pipeline.fit_transform(df[number_cols])
# DataFrame to hold the t-SNE projection (fresh RangeIndex 0..n-1)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
# BUG FIX: df kept its original, gappy index after the genre filter, so
# assigning the Series directly would align on index labels and scramble /
# NaN-fill the genre and cluster columns. Use positional values instead.
projection['genres'] = df['track_genre'].to_numpy()
projection['cluster'] = df['kmeans_7'].to_numpy()
# Scatter plot to visualize the t-SNE projection colored by clusters
fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 68999 samples in 0.102s... [t-SNE] Computed neighbors for 68999 samples in 22.347s... [t-SNE] Computed conditional probabilities for sample 1000 / 68999 [t-SNE] Computed conditional probabilities for sample 2000 / 68999 [t-SNE] Computed conditional probabilities for sample 3000 / 68999 [t-SNE] Computed conditional probabilities for sample 4000 / 68999 [t-SNE] Computed conditional probabilities for sample 5000 / 68999 [t-SNE] Computed conditional probabilities for sample 6000 / 68999 [t-SNE] Computed conditional probabilities for sample 7000 / 68999 [t-SNE] Computed conditional probabilities for sample 8000 / 68999 [t-SNE] Computed conditional probabilities for sample 9000 / 68999 [t-SNE] Computed conditional probabilities for sample 10000 / 68999 [t-SNE] Computed conditional probabilities for sample 11000 / 68999 [t-SNE] Computed conditional probabilities for sample 12000 / 68999 [t-SNE] Computed conditional probabilities for sample 13000 / 68999 [t-SNE] Computed conditional probabilities for sample 14000 / 68999 [t-SNE] Computed conditional probabilities for sample 15000 / 68999 [t-SNE] Computed conditional probabilities for sample 16000 / 68999 [t-SNE] Computed conditional probabilities for sample 17000 / 68999 [t-SNE] Computed conditional probabilities for sample 18000 / 68999 [t-SNE] Computed conditional probabilities for sample 19000 / 68999 [t-SNE] Computed conditional probabilities for sample 20000 / 68999 [t-SNE] Computed conditional probabilities for sample 21000 / 68999 [t-SNE] Computed conditional probabilities for sample 22000 / 68999 [t-SNE] Computed conditional probabilities for sample 23000 / 68999 [t-SNE] Computed conditional probabilities for sample 24000 / 68999 [t-SNE] Computed conditional probabilities for sample 25000 / 68999 [t-SNE] Computed conditional probabilities for sample 26000 / 68999 [t-SNE] Computed conditional probabilities for sample 27000 / 68999 [t-SNE] Computed conditional 
probabilities for sample 28000 / 68999 [t-SNE] Computed conditional probabilities for sample 29000 / 68999 [t-SNE] Computed conditional probabilities for sample 30000 / 68999 [t-SNE] Computed conditional probabilities for sample 31000 / 68999 [t-SNE] Computed conditional probabilities for sample 32000 / 68999 [t-SNE] Computed conditional probabilities for sample 33000 / 68999 [t-SNE] Computed conditional probabilities for sample 34000 / 68999 [t-SNE] Computed conditional probabilities for sample 35000 / 68999 [t-SNE] Computed conditional probabilities for sample 36000 / 68999 [t-SNE] Computed conditional probabilities for sample 37000 / 68999 [t-SNE] Computed conditional probabilities for sample 38000 / 68999 [t-SNE] Computed conditional probabilities for sample 39000 / 68999 [t-SNE] Computed conditional probabilities for sample 40000 / 68999 [t-SNE] Computed conditional probabilities for sample 41000 / 68999 [t-SNE] Computed conditional probabilities for sample 42000 / 68999 [t-SNE] Computed conditional probabilities for sample 43000 / 68999 [t-SNE] Computed conditional probabilities for sample 44000 / 68999 [t-SNE] Computed conditional probabilities for sample 45000 / 68999 [t-SNE] Computed conditional probabilities for sample 46000 / 68999 [t-SNE] Computed conditional probabilities for sample 47000 / 68999 [t-SNE] Computed conditional probabilities for sample 48000 / 68999 [t-SNE] Computed conditional probabilities for sample 49000 / 68999 [t-SNE] Computed conditional probabilities for sample 50000 / 68999 [t-SNE] Computed conditional probabilities for sample 51000 / 68999 [t-SNE] Computed conditional probabilities for sample 52000 / 68999 [t-SNE] Computed conditional probabilities for sample 53000 / 68999 [t-SNE] Computed conditional probabilities for sample 54000 / 68999 [t-SNE] Computed conditional probabilities for sample 55000 / 68999 [t-SNE] Computed conditional probabilities for sample 56000 / 68999 [t-SNE] Computed conditional probabilities for sample 
57000 / 68999 [t-SNE] Computed conditional probabilities for sample 58000 / 68999 [t-SNE] Computed conditional probabilities for sample 59000 / 68999 [t-SNE] Computed conditional probabilities for sample 60000 / 68999 [t-SNE] Computed conditional probabilities for sample 61000 / 68999 [t-SNE] Computed conditional probabilities for sample 62000 / 68999 [t-SNE] Computed conditional probabilities for sample 63000 / 68999 [t-SNE] Computed conditional probabilities for sample 64000 / 68999 [t-SNE] Computed conditional probabilities for sample 65000 / 68999 [t-SNE] Computed conditional probabilities for sample 66000 / 68999 [t-SNE] Computed conditional probabilities for sample 67000 / 68999 [t-SNE] Computed conditional probabilities for sample 68000 / 68999 [t-SNE] Computed conditional probabilities for sample 68999 / 68999 [t-SNE] Mean sigma: 0.000000 [t-SNE] KL divergence after 250 iterations with early exaggeration: 101.086151 [t-SNE] KL divergence after 1000 iterations: 2.287273
import numpy as np
# Song clustering Pipeline
song_cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Standardizing
    ('kmeans', KMeans(n_clusters=7, verbose=False))  # K-means clustering
], verbose=False)
# Numeric feature matrix. Exclude 'kmeans_7': it holds the labels produced
# by the previous clustering run, and feeding them back in as a feature
# would leak that earlier result into this model.
X = df.select_dtypes(np.number).drop(columns=['kmeans_7'], errors='ignore')
number_cols = list(X.columns)
# Fit the pipeline to the data
song_cluster_pipeline.fit(X)
# Predict cluster labels for the data
song_cluster_labels = song_cluster_pipeline.predict(X)
# Assign cluster labels to the original dataset
df['cluster_label'] = song_cluster_labels
# Visualizing the Clusters with PCA
from sklearn.decomposition import PCA
# Pipeline for PCA
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
# Fitting the pipeline to the data and transforming it to get the PCA embedding
song_embedding = pca_pipeline.fit_transform(X)
# DataFrame to hold the PCA projection (fresh RangeIndex)
projection = pd.DataFrame(song_embedding, columns=['x', 'y'])
# BUG FIX: df's filtered index does not match the projection's RangeIndex,
# so direct Series assignment would misalign rows — use positional values.
projection['title'] = df['track_name'].to_numpy()
projection['cluster'] = df['cluster_label'].to_numpy()
# Scatter plot to visualize the PCA projection colored by clusters
fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()
Based on the analysis and visualizations, it's evident that songs belonging to similar genres tend to cluster together, indicating a correlation between similarity and the proximity of data points. This observation is intuitive as songs within the same genre typically share similar musical characteristics and are often produced within similar time periods. Leveraging this insight, we can develop a recommendation system by using the data points of songs a user has listened to and suggesting songs that are located near those data points.
Spotipy, a Python client for the Spotify Web API, facilitates the retrieval of data and querying of Spotify's extensive catalog of songs.
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
# SECURITY NOTE(review): the original notebook hard-coded a Spotify client
# id and secret here. Credentials must never live in source control — read
# them from the environment. The old literals are kept only as fallbacks so
# existing runs keep working; rotate these keys and remove the fallbacks.
cid = os.environ.get('SPOTIFY_CLIENT_ID', '778a29e66e464a668d6ed5419a05ec3d')
secret = os.environ.get('SPOTIFY_CLIENT_SECRET', '937d60c66cae42a980dcdbd9fe4968bd')
# Client credentials manager
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
# Spotify client
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
The code below defines a find_song function that takes two arguments — artist and track_name — identifying the song to look up. The function filters the DataFrame df to the rows where both the artist string and the track name match exactly. If no matching song is found, the function returns None. Otherwise, the song's track name and artists, along with a predefined list of audio feature values from the first matching row, are collected into a dictionary of lists, which is returned as a one-row DataFrame. An example usage is provided, where the function is called with Nirvana's "Come As You Are" and the result (with standardized audio-feature values) is printed.
from collections import defaultdict
import pandas as pd
def find_song(artist, track_name):
    """Look up a single track in the global ``df`` by artist and title.

    Returns a one-row DataFrame holding the track name, artist string and a
    fixed set of audio features, or ``None`` when no match exists.
    """
    matches = df[(df['track_name'] == track_name) & (df['artists'] == artist)]
    # Guard clause: nothing to report when the track is absent.
    if matches.empty:
        return None
    # Only the first matching row is used, mirroring a single-song lookup.
    row = matches.iloc[0]
    record = defaultdict(list)
    record['name'].append(row['track_name'])
    record['artists'].append(row['artists'])
    audio_features = ['acousticness', 'danceability', 'energy', 'liveness',
                      'loudness', 'mode', 'speechiness', 'tempo', 'valence']
    for feature in audio_features:
        record[feature].append(row[feature])
    return pd.DataFrame(record)
# Example usage:
# Look up "Come As You Are" by Nirvana. find_song returns a one-row DataFrame
# of metadata plus audio features, or None when the track is not in df.
artist_name = "Nirvana"
track_name = "Come As You Are"
song_details = find_song(artist_name, track_name)
# Prints the one-row DataFrame, or None if the lookup failed.
print(song_details)
name artists acousticness danceability energy liveness \ 0 Come As You Are Nirvana -0.918685 -0.450929 0.718854 -0.625801 loudness mode speechiness tempo valence 0 0.433696 -1.296546 -0.415584 -0.06467 0.240484
The similarity is determined based on the mean vector of the input songs and the cosine distance between this vector and the vectors of other songs in the dataset.
Get Song Data: Retrieves data for a specific song from the dataset.
Get Mean Vector: Calculates the mean vector for a list of songs. It uses the SimpleImputer to handle any missing values in the song data. The mean vector represents the average feature values of the input songs.
Flatten Dictionary List: Converts a list of dictionaries into a dictionary of lists.
Recommend Songs: Recommends a specified number of songs based on the input song list. It calculates the mean vector for the input songs, scales the data, computes pairwise cosine distances, and identifies the closest songs in the dataset. It then returns the recommended songs.
from collections import defaultdict
from sklearn.metrics import pairwise_distances
from sklearn.impute import SimpleImputer
import numpy as np
# Function to get song data from the dataset
def get_song_data(song, your_data):
    """Return the first row of ``your_data`` matching ``song`` as a Series.

    ``song`` is a dict with 'name' and 'artist' keys; returns ``None`` when
    no row matches both fields.
    """
    mask = ((your_data['track_name'] == song['name'])
            & (your_data['artists'] == song['artist']))
    matches = your_data[mask]
    # Explicit empty check replaces catching IndexError from iloc[0].
    if matches.empty:
        return None
    return matches.iloc[0]
# Function to calculate the mean vector of a list of songs
def get_mean_vector(song_list, your_data):
    """Average the ``number_cols`` feature vectors of the songs in ``song_list``.

    Songs absent from ``your_data`` are skipped with a printed warning.

    Args:
        song_list: list of dicts with 'name' and 'artist' keys.
        your_data: DataFrame containing the ``number_cols`` feature columns.

    Returns:
        1-D numpy array — the element-wise mean of the found songs' features.

    Raises:
        ValueError: if none of the requested songs exist in the dataset.
            (Previously this fell through to np.mean over an empty array,
            producing NaN plus a RuntimeWarning and an obscure crash later.)
    """
    # NOTE(review): fitting SimpleImputer on a single 1-row sample cannot
    # recover a missing value (the column mean of one NaN is still NaN); kept
    # for compatibility, but imputation should be fit on the full dataset.
    imputer = SimpleImputer(strategy='mean')
    song_vectors = []
    for song in song_list:
        song_data = get_song_data(song, your_data)
        if song_data is None:
            print('Warning: {} by {} does not exist in your dataset'.format(song['name'], song['artist']))
            continue
        # Impute missing values and flatten the song vector
        song_data_imputed = imputer.fit_transform(song_data[number_cols].values.reshape(1, -1))
        song_vectors.append(song_data_imputed.flatten())
    if not song_vectors:
        raise ValueError('None of the input songs were found in the dataset')
    # Stack the vectors and average across songs (axis 0).
    return np.mean(np.array(song_vectors), axis=0)
# Function to flatten a list of dictionaries into a dictionary of lists
def flatten_dict_list(dict_list):
    """Convert a list of dicts into a dict mapping each key to the list of
    values it takes across ``dict_list``, preserving order.

    Fix: an empty ``dict_list`` now returns an empty mapping instead of
    raising IndexError on ``dict_list[0]``. The old pre-initialization loop
    over the first dict's keys was redundant with defaultdict and is removed.
    """
    flattened_dict = defaultdict(list)
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
    return flattened_dict
# Function to recommend songs based on a list of input songs
def recommend_songs(song_list, your_data, n_songs=10):
    """Recommend up to ``n_songs`` tracks closest (cosine distance) to the
    mean feature vector of the seed songs.

    Args:
        song_list: list of dicts with 'name' and 'artist' keys (seed songs).
        your_data: DataFrame with 'track_name', 'artists' and ``number_cols``.
        n_songs: number of recommendations to return (default 10).

    Returns:
        List of {'track_name': ..., 'artists': ...} dicts.

    Fixes:
        - Candidates are over-selected before filtering, so removing the seed
          songs no longer shrinks the result below ``n_songs``.
        - Only exact seed tracks (name AND artist match) are excluded; the old
          ``~isin(name) & ~isin(artist)`` filter also dropped every other song
          by a seed artist.
    """
    metadata_cols = ['track_name', 'artists']
    song_dict = flatten_dict_list(song_list)
    # Mean feature vector of the seed songs.
    song_center = get_mean_vector(song_list, your_data)
    # Reuse the scaler fitted by the clustering pipeline so distances are
    # computed in the same feature space as the clusters.
    scaler = song_cluster_pipeline.steps[0][1]
    # Only include relevant columns for scaling
    scaled_data = scaler.transform(your_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = pairwise_distances(scaled_song_center, scaled_data, metric='cosine')
    # Take extra candidates so dropping the seeds still leaves n_songs rows.
    n_candidates = n_songs + len(song_list)
    index = list(np.argsort(distances)[:, :n_candidates][0])
    rec_songs = your_data.iloc[index]
    is_seed = (rec_songs['track_name'].isin(song_dict['name'])
               & rec_songs['artists'].isin(song_dict['artist']))
    rec_songs = rec_songs[~is_seed]
    return rec_songs.head(n_songs)[metadata_cols].to_dict(orient='records')
# Example usage: seed the recommender with three tracks and print the results.
song_list = [{'name': 'Moonlight', 'artist': 'XXXTENTACION'},
             {'name': 'PTSD', 'artist': 'G Herbo'},
             {'name': 'Lucid Dreams', 'artist': 'Juice WRLD'}] # List of songs
# Get recommended songs
recommended_songs = recommend_songs(song_list, df, n_songs=10)
print("Recommended Songs:")
# Number the recommendations starting from 1 for readability.
for rank, rec in enumerate(recommended_songs, start=1):
    print(f"{rank}. {rec['track_name']} by {rec['artists']}")
Warning: PTSD by G Herbo does not exist in your dataset Warning: Lucid Dreams by Juice WRLD does not exist in your dataset Recommended Songs: 1. 03' by Sainte 2. Ganha o Mundo by MC Hariel;Mc Dimenor Dr 3. Teu Herói by Weliton O Gordinho;Thiago Aquino 4. Passive Aggressive by Nate Traveller 5. bingo by Don L 6. Rebola Lentin (feat. Mc Kaio) by Bonde do gato preto;Mc Kaio 7. Sento de Repente by Bonde do gato preto